Machine Learning Project¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import category_encoders as ce
import plotly.subplots as sp
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
from category_encoders import TargetEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
import time
from sklearn.linear_model import SGDRegressor
import joblib
Styling¶
def styled_describe(df):
styled = df.describe().style.set_table_styles([
{'selector': 'th', 'props': [('background', '#606060'),
('color', 'white'),
('font-family', 'verdana')]},
{'selector': 'td', 'props': [('font-family', 'verdana')]},
])
return styled
# Shared plotly layout: dark background with white title/legend/axis text.
# Applied to every figure below via fig.update_layout(**layout_options).
layout_options = {
    'paper_bgcolor':"#383838",
    'plot_bgcolor':'#383838',
    'title_font': dict(color='white'),
    'legend_font': dict(color='white'),
    'yaxis':dict(color="white"),
    'xaxis':dict(color="white")
}
1. Data Preprocessing and Exploration¶
1.1 Data Loading and Exploring¶
# Load the NYC rolling-sales dataset and preview the first rows.
df = pd.read_csv("nyc-rolling-sales.csv")
df.head()
| Unnamed: 0 | BOROUGH | NEIGHBORHOOD | BUILDING CLASS CATEGORY | TAX CLASS AT PRESENT | BLOCK | LOT | EASE-MENT | BUILDING CLASS AT PRESENT | ADDRESS | ... | RESIDENTIAL UNITS | COMMERCIAL UNITS | TOTAL UNITS | LAND SQUARE FEET | GROSS SQUARE FEET | YEAR BUILT | TAX CLASS AT TIME OF SALE | BUILDING CLASS AT TIME OF SALE | SALE PRICE | SALE DATE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2A | 392 | 6 | C2 | 153 AVENUE B | ... | 5 | 0 | 5 | 1633 | 6440 | 1900 | 2 | C2 | 6625000 | 2017-07-19 00:00:00 | |
| 1 | 5 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2 | 399 | 26 | C7 | 234 EAST 4TH STREET | ... | 28 | 3 | 31 | 4616 | 18690 | 1900 | 2 | C7 | - | 2016-12-14 00:00:00 | |
| 2 | 6 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2 | 399 | 39 | C7 | 197 EAST 3RD STREET | ... | 16 | 1 | 17 | 2212 | 7803 | 1900 | 2 | C7 | - | 2016-12-09 00:00:00 | |
| 3 | 7 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2B | 402 | 21 | C4 | 154 EAST 7TH STREET | ... | 10 | 0 | 10 | 2272 | 6794 | 1913 | 2 | C4 | 3936272 | 2016-09-23 00:00:00 | |
| 4 | 8 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2A | 404 | 55 | C2 | 301 EAST 10TH STREET | ... | 6 | 0 | 6 | 2369 | 4615 | 1900 | 2 | C2 | 8000000 | 2016-11-17 00:00:00 |
5 rows × 22 columns
# Column overview: several numeric-looking columns (e.g. SALE PRICE) load as object.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 84548 entries, 0 to 84547 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 84548 non-null int64 1 BOROUGH 84548 non-null int64 2 NEIGHBORHOOD 84548 non-null object 3 BUILDING CLASS CATEGORY 84548 non-null object 4 TAX CLASS AT PRESENT 84548 non-null object 5 BLOCK 84548 non-null int64 6 LOT 84548 non-null int64 7 EASE-MENT 84548 non-null object 8 BUILDING CLASS AT PRESENT 84548 non-null object 9 ADDRESS 84548 non-null object 10 APARTMENT NUMBER 84548 non-null object 11 ZIP CODE 84548 non-null int64 12 RESIDENTIAL UNITS 84548 non-null int64 13 COMMERCIAL UNITS 84548 non-null int64 14 TOTAL UNITS 84548 non-null int64 15 LAND SQUARE FEET 84548 non-null object 16 GROSS SQUARE FEET 84548 non-null object 17 YEAR BUILT 84548 non-null int64 18 TAX CLASS AT TIME OF SALE 84548 non-null int64 19 BUILDING CLASS AT TIME OF SALE 84548 non-null object 20 SALE PRICE 84548 non-null object 21 SALE DATE 84548 non-null object dtypes: int64(10), object(12) memory usage: 14.2+ MB
# Hold out the last 5 rows as an unseen sample for sanity-checking the saved
# model later; they are excluded from all preprocessing below.
test_data_after_save_model = df.iloc[-5:]
df = df.iloc[:-5] # All except last 5 rows
test_data_after_save_model.head()
| Unnamed: 0 | BOROUGH | NEIGHBORHOOD | BUILDING CLASS CATEGORY | TAX CLASS AT PRESENT | BLOCK | LOT | EASE-MENT | BUILDING CLASS AT PRESENT | ADDRESS | ... | RESIDENTIAL UNITS | COMMERCIAL UNITS | TOTAL UNITS | LAND SQUARE FEET | GROSS SQUARE FEET | YEAR BUILT | TAX CLASS AT TIME OF SALE | BUILDING CLASS AT TIME OF SALE | SALE PRICE | SALE DATE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 84543 | 8409 | 5 | WOODROW | 02 TWO FAMILY DWELLINGS | 1 | 7349 | 34 | B9 | 37 QUAIL LANE | ... | 2 | 0 | 2 | 2400 | 2575 | 1998 | 1 | B9 | 450000 | 2016-11-28 00:00:00 | |
| 84544 | 8410 | 5 | WOODROW | 02 TWO FAMILY DWELLINGS | 1 | 7349 | 78 | B9 | 32 PHEASANT LANE | ... | 2 | 0 | 2 | 2498 | 2377 | 1998 | 1 | B9 | 550000 | 2017-04-21 00:00:00 | |
| 84545 | 8411 | 5 | WOODROW | 02 TWO FAMILY DWELLINGS | 1 | 7351 | 60 | B2 | 49 PITNEY AVENUE | ... | 2 | 0 | 2 | 4000 | 1496 | 1925 | 1 | B2 | 460000 | 2017-07-05 00:00:00 | |
| 84546 | 8412 | 5 | WOODROW | 22 STORE BUILDINGS | 4 | 7100 | 28 | K6 | 2730 ARTHUR KILL ROAD | ... | 0 | 7 | 7 | 208033 | 64117 | 2001 | 4 | K6 | 11693337 | 2016-12-21 00:00:00 | |
| 84547 | 8413 | 5 | WOODROW | 35 INDOOR PUBLIC AND CULTURAL FACILITIES | 4 | 7105 | 679 | P9 | 155 CLAY PIT ROAD | ... | 0 | 1 | 1 | 10796 | 2400 | 2006 | 4 | P9 | 69300 | 2016-10-27 00:00:00 |
5 rows × 22 columns
# Confirm the holdout rows are gone from the working frame.
df.tail()
| Unnamed: 0 | BOROUGH | NEIGHBORHOOD | BUILDING CLASS CATEGORY | TAX CLASS AT PRESENT | BLOCK | LOT | EASE-MENT | BUILDING CLASS AT PRESENT | ADDRESS | ... | RESIDENTIAL UNITS | COMMERCIAL UNITS | TOTAL UNITS | LAND SQUARE FEET | GROSS SQUARE FEET | YEAR BUILT | TAX CLASS AT TIME OF SALE | BUILDING CLASS AT TIME OF SALE | SALE PRICE | SALE DATE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 84538 | 8404 | 5 | WOODROW | 02 TWO FAMILY DWELLINGS | 1 | 7316 | 61 | B2 | 178 DARNELL LANE | ... | 2 | 0 | 2 | 3215 | 1300 | 1995 | 1 | B2 | - | 2017-06-30 00:00:00 | |
| 84539 | 8405 | 5 | WOODROW | 02 TWO FAMILY DWELLINGS | 1 | 7316 | 85 | B2 | 137 DARNELL LANE | ... | 2 | 0 | 2 | 3016 | 1300 | 1995 | 1 | B2 | - | 2016-12-30 00:00:00 | |
| 84540 | 8406 | 5 | WOODROW | 02 TWO FAMILY DWELLINGS | 1 | 7316 | 93 | B2 | 125 DARNELL LANE | ... | 2 | 0 | 2 | 3325 | 1300 | 1995 | 1 | B2 | 509000 | 2016-10-31 00:00:00 | |
| 84541 | 8407 | 5 | WOODROW | 02 TWO FAMILY DWELLINGS | 1 | 7317 | 126 | B2 | 112 ROBIN COURT | ... | 2 | 0 | 2 | 11088 | 2160 | 1994 | 1 | B2 | 648000 | 2016-12-07 00:00:00 | |
| 84542 | 8408 | 5 | WOODROW | 02 TWO FAMILY DWELLINGS | 1 | 7339 | 41 | B9 | 41 SONIA COURT | ... | 2 | 0 | 2 | 3020 | 1800 | 1997 | 1 | B9 | - | 2016-12-01 00:00:00 |
5 rows × 22 columns
1.2 Data Cleaning¶
drop unimportant column Unnamed: 0
# Drop 'Unnamed: 0' — a leftover CSV index column carrying no information.
df.drop('Unnamed: 0',axis=1,inplace=True)
drop duplicates
# Count duplicate rows, then remove them.
df.duplicated().sum()
df = df.drop_duplicates()
df.head()
| BOROUGH | NEIGHBORHOOD | BUILDING CLASS CATEGORY | TAX CLASS AT PRESENT | BLOCK | LOT | EASE-MENT | BUILDING CLASS AT PRESENT | ADDRESS | APARTMENT NUMBER | ... | RESIDENTIAL UNITS | COMMERCIAL UNITS | TOTAL UNITS | LAND SQUARE FEET | GROSS SQUARE FEET | YEAR BUILT | TAX CLASS AT TIME OF SALE | BUILDING CLASS AT TIME OF SALE | SALE PRICE | SALE DATE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2A | 392 | 6 | C2 | 153 AVENUE B | ... | 5 | 0 | 5 | 1633 | 6440 | 1900 | 2 | C2 | 6625000 | 2017-07-19 00:00:00 | ||
| 1 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2 | 399 | 26 | C7 | 234 EAST 4TH STREET | ... | 28 | 3 | 31 | 4616 | 18690 | 1900 | 2 | C7 | - | 2016-12-14 00:00:00 | ||
| 2 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2 | 399 | 39 | C7 | 197 EAST 3RD STREET | ... | 16 | 1 | 17 | 2212 | 7803 | 1900 | 2 | C7 | - | 2016-12-09 00:00:00 | ||
| 3 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2B | 402 | 21 | C4 | 154 EAST 7TH STREET | ... | 10 | 0 | 10 | 2272 | 6794 | 1913 | 2 | C4 | 3936272 | 2016-09-23 00:00:00 | ||
| 4 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2A | 404 | 55 | C2 | 301 EAST 10TH STREET | ... | 6 | 0 | 6 | 2369 | 4615 | 1900 | 2 | C2 | 8000000 | 2016-11-17 00:00:00 |
5 rows × 21 columns
Handle Empty and Whitespace Values¶
# Identify object columns
# (only string-typed columns can hide ''/whitespace-only placeholder values)
object_columns = df.select_dtypes(include=['object']).columns
# Function to replace whitespace-only and empty string values with NaN
def replace_whitespace_and_empty(val):
    """Return NaN for strings that are empty or all whitespace; pass anything else through.

    The original test ``val.strip() == '' or val == ''`` was redundant — an
    empty string also strips to ``''`` — so a single check suffices.
    Non-string values are returned unchanged.
    """
    if isinstance(val, str) and val.strip() == '':
        return np.nan
    return val
# Apply the function to each object column
# NOTE(review): a vectorized alternative is
# df[col].replace(r'^\s*$', np.nan, regex=True) — same result, faster.
for col in object_columns:
    df[col] = df[col].apply(replace_whitespace_and_empty)
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 83778 entries, 0 to 84542 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 BOROUGH 83778 non-null int64 1 NEIGHBORHOOD 83778 non-null object 2 BUILDING CLASS CATEGORY 83778 non-null object 3 TAX CLASS AT PRESENT 83040 non-null object 4 BLOCK 83778 non-null int64 5 LOT 83778 non-null int64 6 EASE-MENT 0 non-null float64 7 BUILDING CLASS AT PRESENT 83040 non-null object 8 ADDRESS 83778 non-null object 9 APARTMENT NUMBER 18849 non-null object 10 ZIP CODE 83778 non-null int64 11 RESIDENTIAL UNITS 83778 non-null int64 12 COMMERCIAL UNITS 83778 non-null int64 13 TOTAL UNITS 83778 non-null int64 14 LAND SQUARE FEET 83778 non-null object 15 GROSS SQUARE FEET 83778 non-null object 16 YEAR BUILT 83778 non-null int64 17 TAX CLASS AT TIME OF SALE 83778 non-null int64 18 BUILDING CLASS AT TIME OF SALE 83778 non-null object 19 SALE PRICE 83778 non-null object 20 SALE DATE 83778 non-null object dtypes: float64(1), int64(9), object(11) memory usage: 14.1+ MB
Let’s create a hybrid approach for the “SALE DATE” feature. We’ll extract both numeric and categorical components from the date.¶
# Parse 'SALE DATE' into datetimes so individual components can be extracted.
df['SALE DATE'] = pd.to_datetime(df['SALE DATE'])

# Numeric date components.
df['DayOfMonth'] = df['SALE DATE'].dt.day
df['Month'] = df['SALE DATE'].dt.month
df['Year'] = df['SALE DATE'].dt.year

# Categorical date components.
df['DayOfWeek'] = df['SALE DATE'].dt.day_name()

def _month_to_season(month):
    """Map a month number (1-12) to its meteorological season name."""
    if 3 <= month <= 5:
        return 'Spring'
    if 6 <= month <= 8:
        return 'Summer'
    if 9 <= month <= 11:
        return 'Fall'
    return 'Winter'

df['Season'] = df['SALE DATE'].dt.month.apply(_month_to_season)

# The raw timestamp is redundant once its components are extracted.
df.drop('SALE DATE',axis=1,inplace=True)
df.head()
| BOROUGH | NEIGHBORHOOD | BUILDING CLASS CATEGORY | TAX CLASS AT PRESENT | BLOCK | LOT | EASE-MENT | BUILDING CLASS AT PRESENT | ADDRESS | APARTMENT NUMBER | ... | GROSS SQUARE FEET | YEAR BUILT | TAX CLASS AT TIME OF SALE | BUILDING CLASS AT TIME OF SALE | SALE PRICE | DayOfMonth | Month | Year | DayOfWeek | Season | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2A | 392 | 6 | NaN | C2 | 153 AVENUE B | NaN | ... | 6440 | 1900 | 2 | C2 | 6625000 | 19 | 7 | 2017 | Wednesday | Summer |
| 1 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2 | 399 | 26 | NaN | C7 | 234 EAST 4TH STREET | NaN | ... | 18690 | 1900 | 2 | C7 | - | 14 | 12 | 2016 | Wednesday | Winter |
| 2 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2 | 399 | 39 | NaN | C7 | 197 EAST 3RD STREET | NaN | ... | 7803 | 1900 | 2 | C7 | - | 9 | 12 | 2016 | Friday | Winter |
| 3 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2B | 402 | 21 | NaN | C4 | 154 EAST 7TH STREET | NaN | ... | 6794 | 1913 | 2 | C4 | 3936272 | 23 | 9 | 2016 | Friday | Fall |
| 4 | 1 | ALPHABET CITY | 07 RENTALS - WALKUP APARTMENTS | 2A | 404 | 55 | NaN | C2 | 301 EAST 10TH STREET | NaN | ... | 4615 | 1900 | 2 | C2 | 8000000 | 17 | 11 | 2016 | Thursday | Fall |
5 rows × 25 columns
Now we have additional features:
- DayOfMonth (numeric)
- Month (numeric)
- Year (numeric)
- DayOfWeek (categorical)
- Season (categorical)
Separate features¶
Some numerical features like SALE PRICE considered as object type but it must be integer or float so we had to separate the features manually¶
# Columns that are numeric in meaning, even though several load as object
# strings from the CSV (e.g. SALE PRICE contains ' -  ' placeholders).
numeric_features_columns = [
    'EASE-MENT',
    'APARTMENT NUMBER',
    'BLOCK',
    'LOT',
    'ZIP CODE',
    'RESIDENTIAL UNITS',
    'COMMERCIAL UNITS',
    'TOTAL UNITS',
    'LAND SQUARE FEET',
    'GROSS SQUARE FEET',
    'SALE PRICE',
]
# Columns treated as categorical, including the derived date parts and
# YEAR BUILT (discrete labels rather than a continuous quantity).
categorical_features_columns = [
    'BOROUGH',
    'NEIGHBORHOOD',
    'BUILDING CLASS CATEGORY',
    'TAX CLASS AT PRESENT',
    'BUILDING CLASS AT PRESENT',
    'ADDRESS',
    'BUILDING CLASS AT TIME OF SALE',
    'TAX CLASS AT TIME OF SALE',
    'DayOfMonth',
    'Month',
    'Year',
    'DayOfWeek',
    'Season',
    'YEAR BUILT',
]
# Convert non-numeric columns to numeric and replace non-numeric values with NaN
# (errors='coerce' turns placeholder strings into NaN; downcast='integer'
#  shrinks dtypes, e.g. int64 -> int16, to save memory)
for col in numeric_features_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 83778 entries, 0 to 84542 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 BOROUGH 83778 non-null int64 1 NEIGHBORHOOD 83778 non-null object 2 BUILDING CLASS CATEGORY 83778 non-null object 3 TAX CLASS AT PRESENT 83040 non-null object 4 BLOCK 83778 non-null int16 5 LOT 83778 non-null int16 6 EASE-MENT 0 non-null float64 7 BUILDING CLASS AT PRESENT 83040 non-null object 8 ADDRESS 83778 non-null object 9 APARTMENT NUMBER 3802 non-null float64 10 ZIP CODE 83778 non-null int16 11 RESIDENTIAL UNITS 83778 non-null int16 12 COMMERCIAL UNITS 83778 non-null int16 13 TOTAL UNITS 83778 non-null int16 14 LAND SQUARE FEET 57724 non-null float64 15 GROSS SQUARE FEET 56393 non-null float64 16 YEAR BUILT 83778 non-null int64 17 TAX CLASS AT TIME OF SALE 83778 non-null int64 18 BUILDING CLASS AT TIME OF SALE 83778 non-null object 19 SALE PRICE 69602 non-null float64 20 DayOfMonth 83778 non-null int32 21 Month 83778 non-null int32 22 Year 83778 non-null int32 23 DayOfWeek 83778 non-null object 24 Season 83778 non-null object dtypes: float64(5), int16(6), int32(3), int64(3), object(8) memory usage: 12.8+ MB
Column EASE-MENT is entirely empty and column APARTMENT NUMBER is almost empty, so we will drop both¶
# Numeric columns kept after dropping the unusable ones: EASE-MENT is 100%
# null and APARTMENT NUMBER is almost entirely null after coercion.
numeric_features_columns = [
    'BLOCK',
    'LOT',
    'ZIP CODE',
    'RESIDENTIAL UNITS',
    'COMMERCIAL UNITS',
    'TOTAL UNITS',
    'LAND SQUARE FEET',
    'GROSS SQUARE FEET',
    'SALE PRICE',
]
df.drop(['APARTMENT NUMBER', 'EASE-MENT'], axis=1, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 83778 entries, 0 to 84542 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 BOROUGH 83778 non-null int64 1 NEIGHBORHOOD 83778 non-null object 2 BUILDING CLASS CATEGORY 83778 non-null object 3 TAX CLASS AT PRESENT 83040 non-null object 4 BLOCK 83778 non-null int16 5 LOT 83778 non-null int16 6 BUILDING CLASS AT PRESENT 83040 non-null object 7 ADDRESS 83778 non-null object 8 ZIP CODE 83778 non-null int16 9 RESIDENTIAL UNITS 83778 non-null int16 10 COMMERCIAL UNITS 83778 non-null int16 11 TOTAL UNITS 83778 non-null int16 12 LAND SQUARE FEET 57724 non-null float64 13 GROSS SQUARE FEET 56393 non-null float64 14 YEAR BUILT 83778 non-null int64 15 TAX CLASS AT TIME OF SALE 83778 non-null int64 16 BUILDING CLASS AT TIME OF SALE 83778 non-null object 17 SALE PRICE 69602 non-null float64 18 DayOfMonth 83778 non-null int32 19 Month 83778 non-null int32 20 Year 83778 non-null int32 21 DayOfWeek 83778 non-null object 22 Season 83778 non-null object dtypes: float64(3), int16(6), int32(3), int64(3), object(8) memory usage: 11.5+ MB
Tax Class at Present:
Every property in the city is assigned to one of four tax classes (Classes 1, 2, 3, and 4), based on the use of the property.
• Class 1: Includes most residential property of up to three units (such as one-, two-, and three-family homes and small stores or offices with one or two attached apartments), vacant land that is zoned for residential use, and most condominiums that are not more than three stories.
• Class 2: Includes all other property that is primarily residential, such as cooperatives and condominiums.
• Class 3: Includes property with equipment owned by a gas, telephone or electric company.
• Class 4: Includes all other properties not included in class 1,2, and 3, such as offices, factories, warehouses, garage buildings, etc.
so we will filter any value that is not in that one of that four values
# First convert the column to numeric and replace non-numeric values with NaN
# (alphanumeric sub-class labels such as '2A'/'2B' coerce to NaN here)
df['TAX CLASS AT PRESENT'] = pd.to_numeric(df['TAX CLASS AT PRESENT'], errors='coerce', downcast='integer')
# Valid tax classes (NYC Dept. of Finance defines exactly four)
valid_tax_classes = {1, 2, 3, 4}
# Mask anything outside the valid set to NaN. The vectorized isin/where pair
# replaces the original per-row apply(lambda) — same result, O(n) with a
# hashed membership test.
df['TAX CLASS AT PRESENT'] = df['TAX CLASS AT PRESENT'].where(
    df['TAX CLASS AT PRESENT'].isin(valid_tax_classes))
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 83778 entries, 0 to 84542 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 BOROUGH 83778 non-null int64 1 NEIGHBORHOOD 83778 non-null object 2 BUILDING CLASS CATEGORY 83778 non-null object 3 TAX CLASS AT PRESENT 75008 non-null float64 4 BLOCK 83778 non-null int16 5 LOT 83778 non-null int16 6 BUILDING CLASS AT PRESENT 83040 non-null object 7 ADDRESS 83778 non-null object 8 ZIP CODE 83778 non-null int16 9 RESIDENTIAL UNITS 83778 non-null int16 10 COMMERCIAL UNITS 83778 non-null int16 11 TOTAL UNITS 83778 non-null int16 12 LAND SQUARE FEET 57724 non-null float64 13 GROSS SQUARE FEET 56393 non-null float64 14 YEAR BUILT 83778 non-null int64 15 TAX CLASS AT TIME OF SALE 83778 non-null int64 16 BUILDING CLASS AT TIME OF SALE 83778 non-null object 17 SALE PRICE 69602 non-null float64 18 DayOfMonth 83778 non-null int32 19 Month 83778 non-null int32 20 Year 83778 non-null int32 21 DayOfWeek 83778 non-null object 22 Season 83778 non-null object dtypes: float64(4), int16(6), int32(3), int64(3), object(7) memory usage: 11.5+ MB
Year Built:
Since the dataset was compiled in 2017, we filter out any value after that year; and since New York building records do not predate 1850, we also filter out any value before that year.
# Filter out the illogical values: the dataset was compiled in 2017, and NYC
# building records don't predate 1850, so anything outside [1850, 2017] is
# treated as missing. Series.between is inclusive on both ends, matching the
# original `x >= 1850 and x <= 2017` check; .where defaults to NaN elsewhere.
df['YEAR BUILT'] = df['YEAR BUILT'].where(df['YEAR BUILT'].between(1850, 2017))
# BUG FIX: the original print had min/max swapped — it reported
# "Min Year: 2017.0 Max Year: 1850.0". Arguments reordered to match labels.
print('Min Year: {0} Max Year: {1}'.format(df['YEAR BUILT'].min(), df['YEAR BUILT'].max()))
Min Year: 2017.0 Max Year: 1850.0
Year:
This feature represents the year in which the property was sold, so we check every record whose sale date precedes its build date and set the sale year equal to the build year.
# If a sale year precedes the build year (a data error), clamp it to the build year.
df.loc[df['Year'] < df['YEAR BUILT'], 'Year'] = df['YEAR BUILT']
Zip Code:
The data is for New York City, so we have to filter out all zip codes that don't belong to New York.
# Keep only rows whose ZIP code falls inside NYC's range (10001-11697 inclusive).
nyc_zip_range = (10001, 11697)
df = df[(df['ZIP CODE'] >= nyc_zip_range[0]) & (df['ZIP CODE'] <= nyc_zip_range[1])]
1.3 Handling missing values¶
# Define transformers
# Numeric columns: fill missing values with the column median (robust to skew).
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])
# Categorical columns: fill missing values with the most frequent category.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
])
# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features_columns),
        ('cat', categorical_transformer, categorical_features_columns)
    ]
)
# Apply the preprocessor to the data
data_processed = preprocessor.fit_transform(df)
# Convert the processed data back to a DataFrame
# NOTE(review): fit_transform returns one object-dtype ndarray (numeric and
# categorical mixed), so every rebuilt column is dtype object — confirmed by
# df.info() right after this cell. Consider df = df.infer_objects() here so
# later numeric ops (e.g. df.corr()) don't rely on implicit coercion. Column
# order also changes to numeric-first, categorical-second.
numeric_cols = preprocessor.transformers_[0][2]
categorical_cols = preprocessor.transformers_[1][2]
df = pd.DataFrame(data_processed, columns=numeric_cols + list(categorical_cols))
df.head()
| BLOCK | LOT | ZIP CODE | RESIDENTIAL UNITS | COMMERCIAL UNITS | TOTAL UNITS | LAND SQUARE FEET | GROSS SQUARE FEET | SALE PRICE | BOROUGH | ... | BUILDING CLASS AT PRESENT | ADDRESS | BUILDING CLASS AT TIME OF SALE | TAX CLASS AT TIME OF SALE | DayOfMonth | Month | Year | DayOfWeek | Season | YEAR BUILT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 392.0 | 6.0 | 10009.0 | 5.0 | 0.0 | 5.0 | 1633.0 | 6440.0 | 6625000.0 | 1 | ... | C2 | 153 AVENUE B | C2 | 2 | 19 | 7 | 2017 | Wednesday | Summer | 1900.0 |
| 1 | 399.0 | 26.0 | 10009.0 | 28.0 | 3.0 | 31.0 | 4616.0 | 18690.0 | 532500.0 | 1 | ... | C7 | 234 EAST 4TH STREET | C7 | 2 | 14 | 12 | 2016 | Wednesday | Winter | 1900.0 |
| 2 | 399.0 | 39.0 | 10009.0 | 16.0 | 1.0 | 17.0 | 2212.0 | 7803.0 | 532500.0 | 1 | ... | C7 | 197 EAST 3RD STREET | C7 | 2 | 9 | 12 | 2016 | Friday | Winter | 1900.0 |
| 3 | 402.0 | 21.0 | 10009.0 | 10.0 | 0.0 | 10.0 | 2272.0 | 6794.0 | 3936272.0 | 1 | ... | C4 | 154 EAST 7TH STREET | C4 | 2 | 23 | 9 | 2016 | Friday | Fall | 1913.0 |
| 4 | 404.0 | 55.0 | 10009.0 | 6.0 | 0.0 | 6.0 | 2369.0 | 4615.0 | 8000000.0 | 1 | ... | C2 | 301 EAST 10TH STREET | C2 | 2 | 17 | 11 | 2016 | Thursday | Fall | 1900.0 |
5 rows × 23 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 82807 entries, 0 to 82806 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 BLOCK 82807 non-null object 1 LOT 82807 non-null object 2 ZIP CODE 82807 non-null object 3 RESIDENTIAL UNITS 82807 non-null object 4 COMMERCIAL UNITS 82807 non-null object 5 TOTAL UNITS 82807 non-null object 6 LAND SQUARE FEET 82807 non-null object 7 GROSS SQUARE FEET 82807 non-null object 8 SALE PRICE 82807 non-null object 9 BOROUGH 82807 non-null object 10 NEIGHBORHOOD 82807 non-null object 11 BUILDING CLASS CATEGORY 82807 non-null object 12 TAX CLASS AT PRESENT 82807 non-null object 13 BUILDING CLASS AT PRESENT 82807 non-null object 14 ADDRESS 82807 non-null object 15 BUILDING CLASS AT TIME OF SALE 82807 non-null object 16 TAX CLASS AT TIME OF SALE 82807 non-null object 17 DayOfMonth 82807 non-null object 18 Month 82807 non-null object 19 Year 82807 non-null object 20 DayOfWeek 82807 non-null object 21 Season 82807 non-null object 22 YEAR BUILT 82807 non-null object dtypes: object(23) memory usage: 14.5+ MB
separate numeric feature from categorical feature
2. Exploratory Data Analysis (EDA)¶
# Cardinality of each categorical feature — guides the encoding choices below.
df[categorical_features_columns].nunique()
BOROUGH 5 NEIGHBORHOOD 253 BUILDING CLASS CATEGORY 47 TAX CLASS AT PRESENT 4 BUILDING CLASS AT PRESENT 164 ADDRESS 67158 BUILDING CLASS AT TIME OF SALE 164 TAX CLASS AT TIME OF SALE 4 DayOfMonth 31 Month 12 Year 2 DayOfWeek 7 Season 4 YEAR BUILT 144 dtype: int64
2.1 Data Visualization¶
def plot_histogram(df, columns, nbins):
    """Show one dark-themed plotly histogram per column in `columns`."""
    # Plot histograms for categorical columns
    for column in columns:
        fig = px.histogram(df, x=column, nbins=nbins, title=f'Histogram of {column}')
        fig.update_layout(**layout_options)
        fig.show()
plot_histogram(df, categorical_features_columns, 100)
We noticed there is imbalanced data for columns building class category, building class at present, building class at time of sale , Tax class at present, Tax class at time of sale
encoding: NEIGHBORHOOD, ADDRESS
grouping rares: building class category, building class at present, building class at time of sale
def group_rare_categories(df, column, threshold=0.01, other_label="Other"):
    """
    Groups rare categories in a specified column of a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    column (str): The column in which to group rare categories.
    threshold (float or int): The threshold for grouping rare categories.
        If float, it represents the fraction of total records (e.g., 0.01 for 1%).
        If int, it represents the absolute number of occurrences.
    other_label (str): The label to use for grouped categories.

    Returns:
    pd.DataFrame: The same DataFrame with the rare categories grouped into
        `other_label`. Note: the column is modified in place; the return
        value is provided for convenience/chaining.
    """
    # If threshold is a float, convert it to an absolute count threshold
    if isinstance(threshold, float):
        threshold = threshold * len(df)
    # Count the occurrences of each category in the column
    category_counts = df[column].value_counts()
    # Identify categories that should be grouped
    rare_categories = category_counts[category_counts < threshold].index
    # Vectorized replacement: isin + where uses a hashed membership test,
    # unlike the original per-row apply(lambda x: ... x in rare_categories)
    # which scanned an Index for every row (O(n*k)).
    df[column] = df[column].where(~df[column].isin(rare_categories), other_label)
    return df
# Columns whose rare categories get collapsed into "Other"
grouping_rare_columns = [
    'BUILDING CLASS AT TIME OF SALE',
    'BUILDING CLASS CATEGORY',
    'BUILDING CLASS AT PRESENT',
]
# Collapse categories below ~1% of rows (2.5% for the coarser category column)
group_rare_categories(df, 'BUILDING CLASS AT TIME OF SALE', threshold=0.01, other_label="Other")
group_rare_categories(df, 'BUILDING CLASS CATEGORY', threshold=0.025, other_label="Other")
group_rare_categories(df, 'BUILDING CLASS AT PRESENT', threshold=0.01, other_label="Other")
plot_histogram(df, grouping_rare_columns, 100)
# Snapshot the surviving category labels (persisted to disk just below)
building_class_present_unique_values = df['BUILDING CLASS AT PRESENT'].unique().tolist()
building_class_category_unique_values = df['BUILDING CLASS CATEGORY'].unique().tolist()
building_class_time_unique_values = df['BUILDING CLASS AT TIME OF SALE'].unique().tolist()
print(building_class_present_unique_values, '\n', building_class_category_unique_values, '\n', building_class_time_unique_values)
['Other', 'C6', 'D4', 'R2', 'R4', 'R1', 'R9', 'A9', 'B3', 'B1', 'C0', 'C3', 'RG', 'B9', 'A5', 'B2', 'R3', 'A1', 'A2', 'V0'] ['07 RENTALS - WALKUP APARTMENTS ', 'Other', '09 COOPS - WALKUP APARTMENTS ', '10 COOPS - ELEVATOR APARTMENTS ', '13 CONDOS - ELEVATOR APARTMENTS ', '01 ONE FAMILY DWELLINGS ', '02 TWO FAMILY DWELLINGS ', '03 THREE FAMILY DWELLINGS '] ['Other', 'C6', 'D4', 'R2', 'R4', 'R1', 'R9', 'A9', 'B3', 'B1', 'C0', 'C3', 'RG', 'B9', 'A5', 'B2', 'R3', 'A1', 'A2', 'V0']
# Save these lists to files
# (so inference code can validate/align incoming category values)
joblib.dump(building_class_present_unique_values, 'building_class_present_unique_values.pkl')
joblib.dump(building_class_category_unique_values, 'building_class_category_unique_values.pkl')
joblib.dump(building_class_time_unique_values, 'building_class_time_unique_values.pkl')
['building_class_time_unique_values.pkl']
now the features BUILDING CLASS AT TIME OF SALE, BUILDING CLASS CATEGORY, BUILDING CLASS AT PRESENT
have almost more balanced data than before
Since TAX CLASS AT PRESENT and TAX CLASS AT TIME OF SALE have only a handful of rows in category 3, if the values of these two columns are equivalent we will delete the rows with that value in order to make the distribution of the data more balanced.
# Inspect the rows in tax class 3 — the printed output shows there is only
# one, and its present/at-sale values agree.
tax_class_3 = df[df['TAX CLASS AT PRESENT'] == 3]
print(tax_class_3['TAX CLASS AT PRESENT'].values)
print(tax_class_3['TAX CLASS AT TIME OF SALE'].values)
[3.0] [3]
# Drop class-3 rows (utility-company properties) — far too few to learn from.
df = df[df["TAX CLASS AT TIME OF SALE"] != 3]
plot_histogram(df, ['TAX CLASS AT TIME OF SALE', 'TAX CLASS AT PRESENT'], nbins=100)
For the value 4 that has low occurences compared to 1 and 2 (Skewed data) we could use Oversampling or Undersampling to handle that variance in the distribution but each one of them has disadvantages:
Undersampling Disadvantages:
-Risk of losing valuable information from the majority class, which could lead to underfitting.
-The model might not have enough data to learn general patterns from the majority class.
Oversampling Disadvantages:
-Random oversampling can lead to overfitting since the model might learn specific duplicated examples rather than general patterns.
-Synthetic oversampling might introduce noise if not carefully applied.
So we will use Random Forest with cross-validation for model training and selection, which helps avoid the impact of skewed data on the model.
2.2 Feature Engineering¶
Check for outliers¶
# Create subplots with two columns
def make_box_plot(df, numeric_cols):
fig = sp.make_subplots(rows=len(numeric_cols) // 2 + len(numeric_cols) % 2, cols=2, vertical_spacing=0.03, horizontal_spacing=0.3)
# Add box plots to the subplots
for i, column in enumerate(numeric_cols, start=1):
fig.add_trace(go.Box(y=df[column], name=column), row=(i-1)//2 + 1, col=(i-1)%2 + 1)
# Adjust the layout
fig.update_layout( **layout_options, height=150*len(numeric_cols), width=950)
# Change the color of subplot titles to white
for annotation in fig['layout']['annotations']:
annotation['font'] = dict(color='white')
# Change the color of axis titles and tick labels to white
for axis in fig.layout:
if type(fig.layout[axis]) == go.layout.XAxis:
fig.layout[axis].title.font.color = 'white'
fig.layout[axis].tickfont.color = 'white'
elif type(fig.layout[axis]) == go.layout.YAxis:
fig.layout[axis].title.font.color = 'white'
fig.layout[axis].tickfont.color = 'white'
pio.renderers.default = "notebook" # Change to "iframe" or "svg" if issues persist
# Show the plot
fig.show()
make_box_plot(df, numeric_cols)
Handling outliers using IQR¶
# Define the columns that you want to check for outliers
# (ZIP CODE and the derived date parts are excluded: their ranges were
#  already validated/bounded above)
outlier_columns = [
    'BLOCK',
    'LOT',
    'RESIDENTIAL UNITS',
    'COMMERCIAL UNITS',
    'TOTAL UNITS',
    'LAND SQUARE FEET',
    'GROSS SQUARE FEET',
    'SALE PRICE',
]
# Function to cap outliers via the IQR rule (dead commented-out code from an
# earlier iterative row-dropping version has been removed).
def remove_outliers(df, columns):
    """Replace IQR-fence outliers in `columns` with each column's median.

    Note: despite the name (kept for backward compatibility), no rows are
    removed — outlying values are imputed with the column median so the
    dataset size stays constant.

    Parameters:
    df (pd.DataFrame): frame to clean (modified in place).
    columns (list[str]): numeric columns to process.

    Returns:
    pd.DataFrame: the same frame, for convenience/chaining.
    """
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        # Tukey fences: points beyond 1.5*IQR from the quartiles are outliers.
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # Keep in-range values, impute the rest with the median.
        within = (df[column] >= lower_bound) & (df[column] <= upper_bound)
        df[column] = df[column].where(within, df[column].median())
    return df
# Apply the outlier handling (values are imputed with the median, so the row
# count is unchanged)
df = remove_outliers(df, outlier_columns)
# Check the cleaned dataset
make_box_plot(df, numeric_cols)
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 82806 entries, 0 to 82806 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 BLOCK 82806 non-null object 1 LOT 82806 non-null object 2 ZIP CODE 82806 non-null object 3 RESIDENTIAL UNITS 82806 non-null object 4 COMMERCIAL UNITS 82806 non-null object 5 TOTAL UNITS 82806 non-null object 6 LAND SQUARE FEET 82806 non-null object 7 GROSS SQUARE FEET 82806 non-null object 8 SALE PRICE 82806 non-null object 9 BOROUGH 82806 non-null object 10 NEIGHBORHOOD 82806 non-null object 11 BUILDING CLASS CATEGORY 82806 non-null object 12 TAX CLASS AT PRESENT 82806 non-null object 13 BUILDING CLASS AT PRESENT 82806 non-null object 14 ADDRESS 82806 non-null object 15 BUILDING CLASS AT TIME OF SALE 82806 non-null object 16 TAX CLASS AT TIME OF SALE 82806 non-null object 17 DayOfMonth 82806 non-null object 18 Month 82806 non-null object 19 Year 82806 non-null object 20 DayOfWeek 82806 non-null object 21 Season 82806 non-null object 22 YEAR BUILT 82806 non-null object dtypes: object(23) memory usage: 15.2+ MB
it turns out that there is some column that has Extreme Outliers:
The filtering might have removed some outliers, but the remaining data still have a wide range or other extreme values that extend the whiskers of the box plot.
The outliers might be very close to the IQR boundaries, so the filtering doesn't remove them, or they might not be considered outliers by the IQR method.
We also tried another approach, the Z-score method, and the result was the same, so we will let scaling handle that¶
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 82806 entries, 0 to 82806 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 BLOCK 82806 non-null object 1 LOT 82806 non-null object 2 ZIP CODE 82806 non-null object 3 RESIDENTIAL UNITS 82806 non-null object 4 COMMERCIAL UNITS 82806 non-null object 5 TOTAL UNITS 82806 non-null object 6 LAND SQUARE FEET 82806 non-null object 7 GROSS SQUARE FEET 82806 non-null object 8 SALE PRICE 82806 non-null object 9 BOROUGH 82806 non-null object 10 NEIGHBORHOOD 82806 non-null object 11 BUILDING CLASS CATEGORY 82806 non-null object 12 TAX CLASS AT PRESENT 82806 non-null object 13 BUILDING CLASS AT PRESENT 82806 non-null object 14 ADDRESS 82806 non-null object 15 BUILDING CLASS AT TIME OF SALE 82806 non-null object 16 TAX CLASS AT TIME OF SALE 82806 non-null object 17 DayOfMonth 82806 non-null object 18 Month 82806 non-null object 19 Year 82806 non-null object 20 DayOfWeek 82806 non-null object 21 Season 82806 non-null object 22 YEAR BUILT 82806 non-null object dtypes: object(23) memory usage: 15.2+ MB
For some features we will encode the categorical data using Frequency Encoding because we have high-cardinality categorical features (some features have a large number of categories, which would produce a large number of columns under One-Hot Encoding); Label Encoding is also a poor fit for a large number of categories, and Target Encoding is powerful but risks overfitting.¶
for other features that does not have high cardinality we will use Label Encoder.¶
# Initialize separate LabelEncoders for each column
# (one encoder per column so each mapping can be persisted and reapplied
#  independently at inference time)
label_encoder_dayofweek = LabelEncoder()
label_encoder_season = LabelEncoder()
label_encoder_bldg_class_at_sale = LabelEncoder()
label_encoder_bldg_class_cat = LabelEncoder()
label_encoder_bldg_class_present = LabelEncoder()
# Fit and transform each column with its own LabelEncoder
df['DayOfWeek'] = label_encoder_dayofweek.fit_transform(df['DayOfWeek'])
df['Season'] = label_encoder_season.fit_transform(df['Season'])
df['BUILDING CLASS AT TIME OF SALE'] = label_encoder_bldg_class_at_sale.fit_transform(df['BUILDING CLASS AT TIME OF SALE'])
df['BUILDING CLASS CATEGORY'] = label_encoder_bldg_class_cat.fit_transform(df['BUILDING CLASS CATEGORY'])
df['BUILDING CLASS AT PRESENT'] = label_encoder_bldg_class_present.fit_transform(df['BUILDING CLASS AT PRESENT'])
# Save each LabelEncoder individually
joblib.dump(label_encoder_dayofweek, 'label_encoder_dayofweek.pkl')
joblib.dump(label_encoder_season, 'label_encoder_season.pkl')
joblib.dump(label_encoder_bldg_class_at_sale, 'label_encoder_bldg_class_at_sale.pkl')
joblib.dump(label_encoder_bldg_class_cat, 'label_encoder_bldg_class_cat.pkl')
joblib.dump(label_encoder_bldg_class_present, 'label_encoder_bldg_class_present.pkl')
['label_encoder_bldg_class_present.pkl']
# Save frequency encodings
joblib.dump(df['ADDRESS'].value_counts() / len(df), 'address_freq_map.pkl')
joblib.dump(df['NEIGHBORHOOD'].value_counts() / len(df), 'neighborhood_freq_map.pkl')
['neighborhood_freq_map.pkl']
def encoding_with_frequency(df, column):
    """Frequency-encode ``column`` of ``df`` in place.

    Each category is replaced by its relative frequency in this frame
    (count / total number of rows).

    Parameters:
    df (pd.DataFrame): The DataFrame to modify in place.
    column (str): Name of the categorical column to encode.

    Returns:
    pd.DataFrame: The same (mutated) frame, returned for chaining and for
    consistency with the inference-time helper of the same name.
    """
    # Calculate the frequency of each category
    frequency_encoding = df[column].value_counts() / len(df)
    # Map the frequencies onto the original column
    df[column] = df[column].map(frequency_encoding)
    return df
# Frequency-encode the two high-cardinality categorical columns in place.
for high_cardinality_column in ('ADDRESS', 'NEIGHBORHOOD'):
    encoding_with_frequency(df, high_cardinality_column)
# --- Correlation heatmap (lower triangle only) ---------------------------
corr_matrix = df.corr()

plt.figure(figsize=(10, 10), facecolor='k')
sns.set_theme(style="dark")

# The matrix is symmetric, so hide the redundant upper triangle.
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Custom diverging colormap for positive/negative correlations.
diverging_cmap = sns.diverging_palette(250, 25, as_cmap=True)

ax = sns.heatmap(
    corr_matrix,
    mask=upper_triangle,
    cmap=diverging_cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

# White tick labels to stay readable on the dark background.
ax.tick_params(colors='white', axis='y', labelrotation=0)
ax.tick_params(colors='white', axis='x', labelrotation=90)

# Recolor the colorbar ticks and tick labels as well.
cbar = ax.collections[0].colorbar
cbar.ax.yaxis.set_tick_params(color='white')
plt.setp(cbar.ax.get_yticklabels(), color='white')

plt.title('Correlation Matrix', color='white')
plt.show()
Normal Positive Correlations:¶
TOTAL UNITS and RESIDENTIAL UNITS: There appears to be a strong positive correlation, indicating that as the number of residential units increases, the total units also increase, which makes sense because residential units contribute to the total unit count; however, it is not a strong enough relationship to require any action.
Weak or No Correlations:¶
Many pairs of variables show little to no correlation, as indicated by values close to 0 in the matrix. This implies that those variables do not have a linear relationship.
High Negative Correlations:¶
Year and Month: The negative correlation could arise if these variables are being treated in a way that doesn't align with their typical meanings. For example, if Year is represented as a large number (e.g., 2023) and Month is represented as a smaller number (e.g., 1 for January, 12 for December), there might be an unintended mathematical relationship; again, no action is needed.
# Scale every column (features and target alike) to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset, including
# 'SALE PRICE', before the train/test split — confirm this leakage into the
# test fold is acceptable for this project.
scaler = MinMaxScaler().fit(df)

# Rebuild the DataFrame from the scaled array, preserving column names.
df = pd.DataFrame(scaler.transform(df), columns=df.columns)
df.describe()
| BLOCK | LOT | ZIP CODE | RESIDENTIAL UNITS | COMMERCIAL UNITS | TOTAL UNITS | LAND SQUARE FEET | GROSS SQUARE FEET | SALE PRICE | BOROUGH | ... | BUILDING CLASS AT PRESENT | ADDRESS | BUILDING CLASS AT TIME OF SALE | TAX CLASS AT TIME OF SALE | DayOfMonth | Month | Year | DayOfWeek | Season | YEAR BUILT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 82806.000000 | 82806.000000 | 82806.000000 | 82806.000000 | 82806.0 | 82806.000000 | 82806.000000 | 82806.000000 | 82806.000000 | 82806.000000 | ... | 82806.000000 | 82806.000000 | 82806.000000 | 82806.000000 | 82806.000000 | 82806.000000 | 82806.00000 | 82806.000000 | 82806.000000 | 82806.000000 |
| mean | 0.294635 | 0.169374 | 0.506870 | 0.215709 | 0.0 | 0.364325 | 0.491816 | 0.507263 | 0.313295 | 0.499520 | ... | 0.475090 | 0.024631 | 0.475745 | 0.215910 | 0.509863 | 0.506399 | 0.64953 | 0.546832 | 0.493897 | 0.586542 |
| std | 0.237415 | 0.266313 | 0.332221 | 0.188529 | 0.0 | 0.272165 | 0.129825 | 0.138052 | 0.209921 | 0.321786 | ... | 0.283006 | 0.100278 | 0.283930 | 0.268929 | 0.293995 | 0.315787 | 0.47712 | 0.379027 | 0.374666 | 0.201469 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.096739 | 0.011641 | 0.180154 | 0.000000 | 0.0 | 0.333333 | 0.498828 | 0.487449 | 0.183206 | 0.250000 | ... | 0.210526 | 0.000000 | 0.210526 | 0.000000 | 0.266667 | 0.272727 | 0.00000 | 0.166667 | 0.000000 | 0.419162 |
| 50% | 0.242212 | 0.027162 | 0.714117 | 0.200000 | 0.0 | 0.333333 | 0.498828 | 0.487449 | 0.325191 | 0.500000 | ... | 0.578947 | 0.000000 | 0.578947 | 0.333333 | 0.500000 | 0.454545 | 1.00000 | 0.666667 | 0.333333 | 0.538922 |
| 75% | 0.437733 | 0.126940 | 0.800945 | 0.400000 | 0.0 | 0.666667 | 0.498828 | 0.487449 | 0.385536 | 0.750000 | ... | 0.631579 | 0.000000 | 0.631579 | 0.333333 | 0.766667 | 0.818182 | 1.00000 | 0.833333 | 1.000000 | 0.694611 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 23 columns
# Save the scaler
# Persisting the fitted MinMaxScaler lets inference code (preprocess_data)
# apply the identical [0, 1] scaling to incoming data.
joblib.dump(scaler, 'scaler.pkl')
['scaler.pkl']
3. Modeling¶
3.1 Splitting data¶
# Separate the target from the features, then hold out 20% for testing.
target_column = 'SALE PRICE'
y = df[target_column]
X = df.drop(columns=target_column)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
3.2 Model Evaluation and Training¶
# Define a dictionary of models and their hyperparameters for grid search
# Each entry maps a display name to an unfitted estimator plus the grid to
# search; the 'model__' prefix targets the 'model' step of the Pipeline
# built in the search loop below.
model_params = {
    # Plain OLS baseline: nothing to tune.
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}
    },
    # L2-regularised linear regression.
    'Ridge Regression': {
        'model': Ridge(),
        'params': {
            'model__alpha': [0.01, 0.1, 1.0, 10.0],
            'model__max_iter': [1000, 5000, 10000]
        }
    },
    # L1-regularised linear regression (can zero out coefficients).
    'Lasso Regression': {
        'model': Lasso(),
        'params': {
            'model__alpha': [0.01, 0.1, 1.0, 10.0],
            'model__max_iter': [1000, 5000, 10000]
        }
    },
    # Single tree; depth/split limits control overfitting.
    'Decision Tree': {
        'model': DecisionTreeRegressor(),
        'params': {
            'model__max_depth': [5, 10, 15, 20],
            'model__min_samples_split': [2, 5, 10]
        }
    },
    # Bagged ensemble of trees.
    'Random Forest': {
        'model': RandomForestRegressor(),
        'params': {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [5, 10, 15],
            'model__min_samples_split': [2, 5, 10]
        }
    },
    # Boosted ensemble of shallow trees.
    'Gradient Boosting': {
        'model': GradientBoostingRegressor(),
        'params': {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__max_depth': [3, 5, 7]
        }
    },
    # Linear model fitted with stochastic gradient descent.
    'SGD Regressor': {
        'model': SGDRegressor(),
        'params': {
            'model__alpha': [0.0001, 0.001, 0.01, 0.1],
            'model__max_iter': [1000, 5000, 10000],
            'model__penalty': ['l2', 'l1', 'elasticnet'],
            'model__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']
        }
    }
}
# Initialize dictionaries to store the best fitted model and the CV results
# for every candidate estimator.
best_models = {}
results = {}

# Perform RandomizedSearchCV for each model to reduce computation time
for name, model_info in model_params.items():
    start_time = time.time()

    # The data was already MinMax-scaled above, so the pipeline only wraps
    # the estimator (a scaler step here would double-scale).
    pipeline = Pipeline([
        ('model', model_info['model'])
    ])

    # Cap n_iter at the size of the parameter grid: small (or empty) grids
    # are then searched exhaustively without triggering sklearn's
    # "total space of parameters ... is smaller than n_iter" UserWarning,
    # while larger grids still sample 10 candidates as before.
    param_grid = model_info['params']
    search_space_size = int(np.prod([len(values) for values in param_grid.values()])) if param_grid else 1

    randomized_search = RandomizedSearchCV(pipeline, param_grid, cv=5,
                                           scoring='neg_mean_squared_error', n_jobs=-1,
                                           n_iter=min(10, search_space_size),
                                           random_state=42)
    randomized_search.fit(X_train, y_train)

    # Store the best model and its cross-validated performance
    # (scores are negated MSE, so flip the sign back).
    best_models[name] = randomized_search.best_estimator_
    results[name] = {
        'best_score': -randomized_search.best_score_,
        'best_params': randomized_search.best_params_
    }
    elapsed_time = time.time() - start_time
    print(f"{name}: Best MSE = {-randomized_search.best_score_}, Best Params = {randomized_search.best_params_} (Time: {elapsed_time:.2f}s)")
# Collect the per-model search results into a DataFrame for readability.
results_df = pd.DataFrame(results).T
print("\nSummary of Grid Search Results:")
print(results_df)

# Scores are MSE, so lower is better: pick the row with the minimum score.
best_model_name = results_df['best_score'].idxmin()
best_model = best_models[best_model_name]
best_cv_mse = results_df.loc[best_model_name, 'best_score']
print(f"\nThe best model is {best_model_name} with a mean CV MSE of {best_cv_mse}.")

# Sanity-check the chosen model on the held-out test split.
y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(f"Test Set MSE for the best model ({best_model_name}): {test_mse}")
c:\Users\omarq\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_search.py:320: UserWarning: The total space of parameters 1 is smaller than n_iter=10. Running 1 iterations. For exhaustive searches, use GridSearchCV.
Linear Regression: Best MSE = 0.04062624191455223, Best Params = {} (Time: 8.74s)
Ridge Regression: Best MSE = 0.04062567175441606, Best Params = {'model__max_iter': 5000, 'model__alpha': 10.0} (Time: 3.38s)
Lasso Regression: Best MSE = 0.04399376543126933, Best Params = {'model__max_iter': 5000, 'model__alpha': 10.0} (Time: 1.83s)
Decision Tree: Best MSE = 0.03364297302618432, Best Params = {'model__min_samples_split': 10, 'model__max_depth': 10} (Time: 8.18s)
Random Forest: Best MSE = 0.031053158443675576, Best Params = {'model__n_estimators': 50, 'model__min_samples_split': 10, 'model__max_depth': 15} (Time: 305.41s)
Gradient Boosting: Best MSE = 0.03111158332197923, Best Params = {'model__n_estimators': 200, 'model__max_depth': 7, 'model__learning_rate': 0.1} (Time: 339.89s)
SGD Regressor: Best MSE = 0.04062786380200895, Best Params = {'model__penalty': 'l2', 'model__max_iter': 5000, 'model__learning_rate': 'adaptive', 'model__alpha': 0.001} (Time: 39.29s)
Summary of Grid Search Results:
best_score \
Linear Regression 0.040626
Ridge Regression 0.040626
Lasso Regression 0.043994
Decision Tree 0.033643
Random Forest 0.031053
Gradient Boosting 0.031112
SGD Regressor 0.040628
best_params
Linear Regression {}
Ridge Regression {'model__max_iter': 5000, 'model__alpha': 10.0}
Lasso Regression {'model__max_iter': 5000, 'model__alpha': 10.0}
Decision Tree {'model__min_samples_split': 10, 'model__max_d...
Random Forest {'model__n_estimators': 50, 'model__min_sample...
Gradient Boosting {'model__n_estimators': 200, 'model__max_depth...
SGD Regressor {'model__penalty': 'l2', 'model__max_iter': 50...
The best model is Random Forest with a mean CV MSE of 0.031053158443675576.
Test Set MSE for the best model (Random Forest): 0.03115630115433893
# Persist the winning pipeline so it can be reloaded for inference below.
joblib.dump(best_model, 'nyc_property_price_model.pkl')
['nyc_property_price_model.pkl']
# Preview the fully encoded and scaled frame.
df.head()
| BLOCK | LOT | ZIP CODE | RESIDENTIAL UNITS | COMMERCIAL UNITS | TOTAL UNITS | LAND SQUARE FEET | GROSS SQUARE FEET | SALE PRICE | BOROUGH | ... | BUILDING CLASS AT PRESENT | ADDRESS | BUILDING CLASS AT TIME OF SALE | TAX CLASS AT TIME OF SALE | DayOfMonth | Month | Year | DayOfWeek | Season | YEAR BUILT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.028526 | 0.002772 | 0.004725 | 1.0 | 0.0 | 0.333333 | 0.231641 | 0.487449 | 0.325191 | 0.0 | ... | 0.631579 | 0.0 | 0.631579 | 0.333333 | 0.600000 | 0.545455 | 1.0 | 1.000000 | 0.666667 | 0.299401 |
| 1 | 0.029036 | 0.013858 | 0.004725 | 0.2 | 0.0 | 0.333333 | 0.498828 | 0.487449 | 0.325191 | 0.0 | ... | 0.631579 | 0.0 | 0.631579 | 0.333333 | 0.433333 | 1.000000 | 0.0 | 1.000000 | 1.000000 | 0.299401 |
| 2 | 0.029036 | 0.021064 | 0.004725 | 0.2 | 0.0 | 0.333333 | 0.457813 | 0.487449 | 0.325191 | 0.0 | ... | 0.631579 | 0.0 | 0.631579 | 0.333333 | 0.266667 | 1.000000 | 0.0 | 0.000000 | 1.000000 | 0.299401 |
| 3 | 0.029255 | 0.011086 | 0.004725 | 0.2 | 0.0 | 0.333333 | 0.481250 | 0.487449 | 0.325191 | 0.0 | ... | 0.631579 | 0.0 | 0.631579 | 0.333333 | 0.733333 | 0.727273 | 0.0 | 0.000000 | 0.000000 | 0.377246 |
| 4 | 0.029401 | 0.029933 | 0.004725 | 0.2 | 0.0 | 0.333333 | 0.519141 | 0.487449 | 0.325191 | 0.0 | ... | 0.631579 | 0.0 | 0.631579 | 0.333333 | 0.533333 | 0.909091 | 0.0 | 0.666667 | 0.000000 | 0.299401 |
5 rows × 23 columns
def group_rare_categories(df, column, unique_values, other_label="Other"):
    """
    Groups rare categories in a specified column of a DataFrame according to pre-determined unique values.
    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    column (str): The column in which to group rare categories.
    unique_values (list): The list of unique values to keep.
    other_label (str): The label to use for grouped categories.
    Returns:
    pd.DataFrame: A DataFrame with the rare categories grouped into 'Other'.
    """
    # Vectorised replacement: keep values present in unique_values, replace
    # everything else with other_label. Equivalent to the row-wise
    # apply(lambda x: x if x in unique_values else other_label) but one
    # C-level pass instead of a Python call per row.
    df[column] = df[column].where(df[column].isin(unique_values), other_label)
    return df
# Function to preprocess the data
def preprocess_data(df):
    """Re-apply the full training-time preprocessing pipeline to new data.

    Loads the persisted category lists, label encoders, frequency maps and
    MinMaxScaler from disk and applies them in the same order used during
    training.

    Parameters:
    df (pd.DataFrame): Raw data with the same columns as the training CSV
        (including 'SALE PRICE' and 'SALE DATE').

    Returns:
    pd.DataFrame: Fully numeric, [0, 1]-scaled frame in the training column
        order, still containing the scaled 'SALE PRICE' column (drop it
        before calling model.predict).
    """
    # Drop the CSV's unnamed index column if present.
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', axis=1, inplace=True)

    # Convert 'SALE DATE' to datetime format
    df['SALE DATE'] = pd.to_datetime(df['SALE DATE'])

    # Extract numeric date features
    df['DayOfMonth'] = df['SALE DATE'].dt.day
    df['Month'] = df['SALE DATE'].dt.month
    df['Year'] = df['SALE DATE'].dt.year

    # Extract categorical date features
    df['DayOfWeek'] = df['SALE DATE'].dt.day_name()
    df['Season'] = df['SALE DATE'].dt.month.apply(lambda x: 'Spring' if 3 <= x <= 5
                                                  else ('Summer' if 6 <= x <= 8
                                                        else ('Fall' if 9 <= x <= 11
                                                              else 'Winter')))
    df.drop('SALE DATE', axis=1, inplace=True)

    # Drop columns that were discarded during training
    df.drop(['APARTMENT NUMBER', 'EASE-MENT'], axis=1, inplace=True)

    # Load the per-column category lists kept at training time and collapse
    # every other category into 'Other'.
    # NOTE(review): this assumes 'Other' is among each label encoder's
    # fitted classes — confirm, otherwise transform() below raises.
    building_class_present_unique_values = joblib.load('building_class_present_unique_values.pkl')
    building_class_category_unique_values = joblib.load('building_class_category_unique_values.pkl')
    building_class_time_unique_values = joblib.load('building_class_time_unique_values.pkl')
    df = group_rare_categories(df, 'BUILDING CLASS AT PRESENT', building_class_present_unique_values)
    df = group_rare_categories(df, 'BUILDING CLASS CATEGORY', building_class_category_unique_values)
    df = group_rare_categories(df, 'BUILDING CLASS AT TIME OF SALE', building_class_time_unique_values)

    # Load the label encoders fitted during training
    le_dayofweek = joblib.load('label_encoder_dayofweek.pkl')
    le_season = joblib.load('label_encoder_season.pkl')
    le_bldg_class_at_sale = joblib.load('label_encoder_bldg_class_at_sale.pkl')
    le_bldg_class_cat = joblib.load('label_encoder_bldg_class_cat.pkl')
    le_bldg_class_present = joblib.load('label_encoder_bldg_class_present.pkl')

    # Apply the encodings using the loaded label encoders
    df['DayOfWeek'] = le_dayofweek.transform(df['DayOfWeek'])
    df['Season'] = le_season.transform(df['Season'])
    df['BUILDING CLASS AT TIME OF SALE'] = le_bldg_class_at_sale.transform(df['BUILDING CLASS AT TIME OF SALE'])
    df['BUILDING CLASS CATEGORY'] = le_bldg_class_cat.transform(df['BUILDING CLASS CATEGORY'])
    df['BUILDING CLASS AT PRESENT'] = le_bldg_class_present.transform(df['BUILDING CLASS AT PRESENT'])

    # Apply frequency encoding with the training-time frequency maps.
    def encoding_with_frequency(df, column, frequency_map):
        # Categories unseen at training time map to NaN; treat them as
        # zero-frequency rather than feeding NaN into the scaler.
        df[column] = df[column].map(frequency_map).fillna(0)
        return df

    address_freq_map = joblib.load('address_freq_map.pkl')
    neighborhood_freq_map = joblib.load('neighborhood_freq_map.pkl')
    df = encoding_with_frequency(df, 'ADDRESS', address_freq_map)
    df = encoding_with_frequency(df, 'NEIGHBORHOOD', neighborhood_freq_map)

    # Reorder the columns to match the order the scaler was fitted with.
    new_order = ['BLOCK',
                 'LOT',
                 'ZIP CODE',
                 'RESIDENTIAL UNITS',
                 'COMMERCIAL UNITS',
                 'TOTAL UNITS',
                 'LAND SQUARE FEET',
                 'GROSS SQUARE FEET',
                 'SALE PRICE',
                 'BOROUGH',
                 'NEIGHBORHOOD',
                 'BUILDING CLASS CATEGORY',
                 'TAX CLASS AT PRESENT',
                 'BUILDING CLASS AT PRESENT',
                 'ADDRESS',
                 'BUILDING CLASS AT TIME OF SALE',
                 'TAX CLASS AT TIME OF SALE',
                 'DayOfMonth',
                 'Month',
                 'Year',
                 'DayOfWeek',
                 'Season',
                 'YEAR BUILT']
    df = df[new_order]

    # Load and apply the MinMaxScaler fitted during training
    scaler = joblib.load('scaler.pkl')
    scaled_data = scaler.transform(df)

    # Convert scaled data back to DataFrame for easier handling
    df = pd.DataFrame(scaled_data, columns=df.columns)
    return df
# Load the trained model once up front.
model = joblib.load('nyc_property_price_model.pkl')

# Run the saved preprocessing pipeline on the held-back sample (copy so the
# original frame is left untouched), then drop the scaled target column.
processed_data = preprocess_data(test_data_after_save_model.copy())
processed_data.drop('SALE PRICE', axis=1, inplace=True)

# Predict scaled sale prices for the new data.
predictions = model.predict(processed_data)
print(predictions)
0 0.274809 1 0.335878 2 0.280916 3 7.140969 4 0.042321 Name: SALE PRICE, dtype: float64 [0.35252914 0.33057194 0.29541667 0.4037396 0.31013076]